{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "TimeGAN_Blocks.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "dDMwaNg-lIF6"
      },
      "source": [
        "# imports \n",
        "import numpy as np\n",
        "import torch \n",
        "from torch import nn\n",
        "from torch import functional as F\n",
        "import os"
      ],
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "M8HmVjNjnfHK"
      },
      "source": [
        "# Switch to the project folder first, so that the cell below shows the\n",
        "# directory the rest of the notebook runs in.\n",
        "# NOTE(review): absolute Colab path - adjust when running elsewhere.\n",
        "os.chdir(\"/content/TimeGAN/\")"
      ],
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 36
        },
        "id": "TOFqrL6SnXqs",
        "outputId": "fd322ee5-8a17-4e68-d9b1-9353400f8f82"
      },
      "source": [
        "# Confirm the working directory after the change above\n",
        "os.getcwd()"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'/content/TimeGAN'"
            ]
          },
          "metadata": {},
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "4m_31dcnpYOS"
      },
      "source": [
        "def Normalize(dta):\n",
        "  \"\"\"Min-max scale dta along axis 0.\n",
        "\n",
        "  Args:\n",
        "    - dta: array whose minimum and maximum are taken over axis 0\n",
        "\n",
        "  Returns:\n",
        "    - (dta - min) / (max - min + 1e-7); the 1e-7 keeps the division\n",
        "      defined where max equals min\n",
        "  \"\"\"\n",
        "  lowest = np.min(dta, 0)\n",
        "  spread = np.max(dta, 0) - lowest\n",
        "  shifted = dta - lowest\n",
        "  return shifted / (spread + 1e-7)"
      ],
      "execution_count": 9,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "aCA9XQ9XlRy8"
      },
      "source": [
        "def real_data_loading (path, seq_len):\n",
        "  \"\"\"Read a numeric CSV file and cut it into shuffled, overlapping windows.\n",
        "\n",
        "  Args:\n",
        "    - path: path of a comma-separated file; its first row is skipped\n",
        "    - seq_len: number of consecutive rows per window\n",
        "\n",
        "  Returns:\n",
        "    - data: list of arrays, each holding seq_len consecutive rows of the\n",
        "      normalized data, in random order\n",
        "  \"\"\"\n",
        "  assert os.path.isfile(path)\n",
        "  raw = np.loadtxt(path, delimiter = \",\", skiprows = 1)\n",
        "\n",
        "  # Reverse the row order (to make the data chronological), then scale it\n",
        "  scaled = Normalize(raw[::-1])\n",
        "\n",
        "  # One window per start row in [0, len - seq_len); stacking the windows\n",
        "  # gives dims [len(dta) - seq_len, seq_len, n_variables]\n",
        "  windows = [scaled[start:start + seq_len] for start in range(len(scaled) - seq_len)]\n",
        "\n",
        "  # Shuffle the windows (to make the set similar to i.i.d. samples)\n",
        "  order = np.random.permutation(len(windows))\n",
        "  return [windows[pos] for pos in order]"
      ],
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ZtYMYYEupxyr"
      },
      "source": [
        "# taken from the tutorial\n",
        "# every window holds seq_len consecutive rows of the file\n",
        "seq_len = 24\n",
        "# NOTE(review): absolute Colab path; real_data_loading asserts that this file exists\n",
        "path = \"/content/TimeGAN/stock_data.csv\"\n",
        "# list of windows in random order (no random seed is set in the cells above,\n",
        "# so the order differs between runs)\n",
        "ori_data = real_data_loading(path, seq_len)"
      ],
      "execution_count": 11,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6CYXHquCqH6u",
        "outputId": "44a9eb72-ba13-4b57-e7f4-54a0d4b51e4e"
      },
      "source": [
        "# stacked shape: (number of windows, seq_len, number of variables)\n",
        "np.array(ori_data).shape"
      ],
      "execution_count": 22,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(3661, 24, 6)"
            ]
          },
          "metadata": {},
          "execution_count": 22
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0SYnkoi0qIwM"
      },
      "source": [
        "def train_test_divide (data_x, data_x_hat, data_t, data_t_hat, train_rate = 0.8):\n",
        "  \"\"\"Randomly split original and synthetic samples into train and test parts.\n",
        "\n",
        "  The original and the synthetic collection are shuffled independently of\n",
        "  each other; inside one collection the data and its time information are\n",
        "  selected with the same permutation.\n",
        "\n",
        "  Args:\n",
        "    - data_x: original data\n",
        "    - data_x_hat: generated data\n",
        "    - data_t: original time\n",
        "    - data_t_hat: generated time\n",
        "    - train_rate: fraction of each collection that goes into the train part\n",
        "\n",
        "  Returns:\n",
        "    - train_x, train_x_hat, test_x, test_x_hat,\n",
        "      train_t, train_t_hat, test_t, test_t_hat (all lists)\n",
        "  \"\"\"\n",
        "  def _split(samples, times):\n",
        "    # A single permutation indexes both lists, which keeps pairs aligned\n",
        "    count = len(samples)\n",
        "    order = np.random.permutation(count)\n",
        "    cut = int(count*train_rate)\n",
        "    head, tail = order[:cut], order[cut:]\n",
        "    return ([samples[j] for j in head], [samples[j] for j in tail],\n",
        "            [times[j] for j in head], [times[j] for j in tail])\n",
        "\n",
        "  # Original data is split first, synthetic data second (order of the random draws)\n",
        "  train_x, test_x, train_t, test_t = _split(data_x, data_t)\n",
        "  train_x_hat, test_x_hat, train_t_hat, test_t_hat = _split(data_x_hat, data_t_hat)\n",
        "\n",
        "  return train_x, train_x_hat, test_x, test_x_hat, train_t, train_t_hat, test_t, test_t_hat"
      ],
      "execution_count": 27,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KBmKYZFBqrFt"
      },
      "source": [
        "def extract_time (data):\n",
        "  \"\"\"Collect the length of every sequence and the largest length found.\n",
        "\n",
        "  Args:\n",
        "    - data: indexable collection of sequences; each one is sliced as seq[:,0],\n",
        "      so it has to support 2-D indexing\n",
        "\n",
        "  Returns:\n",
        "    - time: list with the length of each sequence\n",
        "    - max_seq_len: largest value in time (0 when data is empty)\n",
        "  \"\"\"\n",
        "  time = [len(data[pos][:,0]) for pos in range(len(data))]\n",
        "  # the leading 0 reproduces the starting value used for the running maximum\n",
        "  max_seq_len = max([0] + time)\n",
        "  return time, max_seq_len"
      ],
      "execution_count": 28,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "f471bsmbqstA"
      },
      "source": [
        "# length of every window and the largest of those lengths\n",
        "seq_lengths, max_seq_len = extract_time(ori_data)"
      ],
      "execution_count": 35,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "cKF15q0ArWQ2",
        "outputId": "2061c372-b7b7-4fa9-b8cd-70d572baa20f"
      },
      "source": [
        "# one length entry per window\n",
        "np.array(seq_lengths).shape"
      ],
      "execution_count": 36,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(3661,)"
            ]
          },
          "metadata": {},
          "execution_count": 36
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "mRWHfnXuuAfD"
      },
      "source": [
        "def batch_generator(data, time, batch_size):\n",
        "  \"\"\"Draw one random mini-batch without replacement.\n",
        "\n",
        "  Args:\n",
        "    - data: time-series data\n",
        "    - time: time information, aligned with data by position\n",
        "    - batch_size: the number of samples in the batch\n",
        "\n",
        "  Returns:\n",
        "    - X_mb: list with the selected time-series\n",
        "    - T_mb: list with the time information of the selected time-series\n",
        "  \"\"\"\n",
        "  # The first batch_size entries of a random permutation select the samples;\n",
        "  # fewer are returned when data holds less than batch_size items\n",
        "  picked = np.random.permutation(len(data))[:batch_size]\n",
        "\n",
        "  X_mb, T_mb = [], []\n",
        "  for pos in picked:\n",
        "    X_mb.append(data[pos])\n",
        "    T_mb.append(time[pos])\n",
        "\n",
        "  return X_mb, T_mb"
      ],
      "execution_count": 39,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Sha1JERtuCUj"
      },
      "source": [
        "# window lengths again, stored under the name that is passed to batch_generator below\n",
        "ori_time, max_seq_len = extract_time(ori_data)\n",
        "# number of windows, window length, number of variables\n",
        "no, seq_len, dim = np.asarray(ori_data).shape\n",
        "batch_size = 128\n",
        "# the random vectors get as many features per time step as the data\n",
        "z_dim = dim"
      ],
      "execution_count": 49,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9qwcQtOEsw1i"
      },
      "source": [
        "def random_generator (batch_size, z_dim, T_mb, max_seq_len):\n",
        "  \"\"\"Random vector generation.\n",
        "\n",
        "  Every sequence gets a [max_seq_len, z_dim] matrix: the first T_mb[i] rows\n",
        "  hold uniform noise from [0, 1), the remaining rows stay zero (padding).\n",
        "  All matrices share one shape, so the result can be stacked into a single\n",
        "  array even when the sequence lengths differ.\n",
        "\n",
        "  Args:\n",
        "    - batch_size: number of random matrices to generate\n",
        "    - z_dim: dimension of random vector\n",
        "    - T_mb: sequence length of every sample (each at most max_seq_len)\n",
        "    - max_seq_len: maximum sequence length\n",
        "\n",
        "  Returns:\n",
        "    - Z_mb: list of batch_size arrays of shape [max_seq_len, z_dim]\n",
        "  \"\"\"\n",
        "  Z_mb = list()\n",
        "  for i in range(batch_size):\n",
        "    temp = np.zeros([max_seq_len, z_dim])\n",
        "    temp[:T_mb[i],:] = np.random.uniform(0., 1, [T_mb[i], z_dim])\n",
        "    # append the padded matrix, not the bare noise block of length T_mb[i]\n",
        "    Z_mb.append(temp)\n",
        "  return Z_mb"
      ],
      "execution_count": 37,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NfVRanFZtaX6"
      },
      "source": [
        "# draw batch_size random windows together with their lengths\n",
        "X_mb, T_mb = batch_generator(ori_data, ori_time, batch_size) "
      ],
      "execution_count": 42,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xwsbnEYgtk3L",
        "outputId": "6c44e6e0-0326-4a18-a638-61a7273ef2ce"
      },
      "source": [
        "np.array(X_mb).shape, np.array(T_mb).shape\n",
        "# we feed batches of 128 sequences each into the network"
      ],
      "execution_count": 45,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "((128, 24, 6), (128,))"
            ]
          },
          "metadata": {},
          "execution_count": 45
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "iZuKU1Blux8j",
        "outputId": "0df274ff-5074-45a9-e5de-0d1ff1157b47"
      },
      "source": [
        "# random vectors for the same batch: one matrix per drawn sequence\n",
        "Z_mb = random_generator(batch_size, z_dim, T_mb, max_seq_len)\n",
        "np.array(Z_mb).shape"
      ],
      "execution_count": 51,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(128, 24, 6)"
            ]
          },
          "metadata": {},
          "execution_count": 51
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wu0Iq7eovA7i"
      },
      "source": [
        "# Placeholders of the TensorFlow reference code, kept as a note on the shapes:\n",
        "# X = tf.placeholder(tf.float32, [None, max_seq_len, dim], name = \"myinput_x\")\n",
        "# Z = tf.placeholder(tf.float32, [None, max_seq_len, z_dim], name = \"myinput_z\")\n",
        "# T = tf.placeholder(tf.int32, [None], name = \"myinput_t\")\n",
        "# X is the original data => Flexible batch size, 24, 6 for stock data\n",
        "# Z is the random input vector => Flex batch size, 24, 6 for stock data\n",
        "# T is the time information => Flexible batch size, here 128 for the time data\n",
        "num_layers = 2\n",
        "hidden_dim = 24\n",
        "seq_len = 24\n",
        "# input size is the number of features per time step (6 for the stock data),\n",
        "# not the sequence length\n",
        "input_size = 6\n",
        "device = \"cpu\"\n",
        "batch_size = 128\n",
        "\n",
        "# from the RNN documentation: \n",
        "# arguments: inputsize = num_features, hidden_size, num_layers\n",
        "# rnn = nn.RNN(6, 24, 2, batch_first=True)\n",
        "# batch seq features 128, 24, 6\n",
        "# input: batch, length, features\n",
        "# input = torch.randn(128, 24, 6)\n",
        "# num layers, batch, hidden size\n",
        "# h0 = torch.randn(2, 128, 24)\n",
        "# output, hn = rnn(input, h0)\n",
        "\n",
        "class RNN(nn.Module):\n",
        "  \"\"\"Stacked vanilla RNN followed by a per-time-step Linear layer and a sigmoid.\n",
        "\n",
        "  Args:\n",
        "    - input_size: number of features per time step\n",
        "    - hidden_dim: hidden size of the RNN and width of the output\n",
        "    - num_layers: number of stacked RNN layers\n",
        "  \"\"\"\n",
        "  def __init__(self, input_size, hidden_dim, num_layers):\n",
        "    super(RNN, self).__init__()\n",
        "    self.num_layers = num_layers\n",
        "    self.hidden_dim = hidden_dim\n",
        "    # batch must be first dimension: X => batch_size, seq_length, num_features\n",
        "    self.rnn = nn.RNN(input_size, hidden_dim, num_layers, batch_first = True)\n",
        "    self.fc = nn.Linear(hidden_dim, hidden_dim)\n",
        "    self.nonlinearity = nn.Sigmoid()\n",
        "\n",
        "  # rnn needs two inputs: data & initial state\n",
        "  def forward(self, x):\n",
        "    \"\"\"Map x [batch, seq_len, input_size] to values in (0, 1) of shape [batch, seq_len, hidden_dim].\"\"\"\n",
        "    # Zero initial state [num_layers, batch, hidden_dim], created with the\n",
        "    # device and dtype of x so the module does not depend on the notebook\n",
        "    # variable `device` and also works after .to(...) / .double()\n",
        "    h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_dim)\n",
        "    out, _ = self.rnn(x, h0)\n",
        "    # out: batch_size, seq_len, hidden_dim - every time step is kept\n",
        "    out = self.fc(out)\n",
        "    out = self.nonlinearity(out)\n",
        "    return out"
      ],
      "execution_count": 163,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bPxm_1hl3YpV",
        "outputId": "e790981f-aab5-4896-86ac-f61d44411dad"
      },
      "source": [
        "# testing the RNN module\n",
        "net = RNN(6, 24, 5)\n",
        "# converting everything to float to avoid data type runtime errors\n",
        "net.float()\n",
        "\n",
        "# using one minibatch as example data; the array is built as float32 right\n",
        "# away (ndarray.astype returns a new array, so calling it without keeping\n",
        "# the result has no effect)\n",
        "dta = torch.from_numpy(np.array(X_mb, dtype=np.float32))\n",
        "\n",
        "net(dta).shape\n",
        "# yields 128 x 24 x 24 => 128 samples in batch, 24 time steps, every time step\n",
        "# is embedded into 24 features\n",
        "# => we embed in a higher dimensional space in this case, as mentioned in the paper"
      ],
      "execution_count": 164,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "torch.Size([128, 24, 24])"
            ]
          },
          "metadata": {},
          "execution_count": 164
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "yJ3PBirp3aKa"
      },
      "source": [
        "# lets modify the code to add lstm and gru \n",
        "class Custom_RNN(nn.Module):\n",
        "  \"\"\"Recurrent layers (RNN, GRU or LSTM), then a per-time-step Linear layer and a sigmoid.\n",
        "\n",
        "  Args:\n",
        "    - input_size: number of features per time step of the input\n",
        "    - hidden_dim: hidden size of the recurrent layers\n",
        "    - num_layers: number of stacked recurrent layers\n",
        "    - rnn_type: type of rnn [rnn, gru, lstm]\n",
        "    - orig_dim: output width of the Linear layer when embed is False\n",
        "    - embed: if true the Linear layer keeps hidden_dim features (embedding),\n",
        "      otherwise it projects to orig_dim features (recovery)\n",
        "\n",
        "  Raises:\n",
        "    - ValueError: if rnn_type is not one of the supported names\n",
        "  \"\"\"\n",
        "  # all three layer classes take (input_size, hidden_size, num_layers, batch_first)\n",
        "  _RECURRENT_LAYERS = {\"rnn\": nn.RNN, \"gru\": nn.GRU, \"lstm\": nn.LSTM}\n",
        "\n",
        "  def __init__(self, input_size, hidden_dim, num_layers, rnn_type, orig_dim = 24, embed = True):\n",
        "    super(Custom_RNN, self).__init__()\n",
        "    # Reject unknown types here; otherwise self.net would be missing and the\n",
        "    # failure would only show up inside forward()\n",
        "    if rnn_type not in self._RECURRENT_LAYERS:\n",
        "      raise ValueError(\"rnn_type must be one of 'rnn', 'gru', 'lstm'; got %r\" % (rnn_type,))\n",
        "    self.num_layers = num_layers\n",
        "    self.hidden_dim = hidden_dim\n",
        "    self.rnn_type = rnn_type\n",
        "    self.embed = embed\n",
        "    # batch must be first dimension: X => batch_size, seq_length, num_features\n",
        "    self.net = self._RECURRENT_LAYERS[rnn_type](input_size, hidden_dim, num_layers, batch_first = True)\n",
        "    # distinction between embedding & recovery: only the output width differs\n",
        "    self.fc = nn.Linear(hidden_dim, hidden_dim if embed else orig_dim)\n",
        "    self.nonlinearity = nn.Sigmoid()\n",
        "\n",
        "  # rnn needs two inputs: data & initial state\n",
        "  def forward(self, x):\n",
        "    \"\"\"Map x [batch, seq_len, input_size] to values in (0, 1) of shape [batch, seq_len, out_features of fc].\"\"\"\n",
        "    # Zero initial state [num_layers, batch, hidden_dim], created with the\n",
        "    # device and dtype of x instead of a notebook-level `device` variable\n",
        "    h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_dim)\n",
        "    if self.rnn_type == \"lstm\":\n",
        "      # additional initial cell state has same shape\n",
        "      out, _ = self.net(x, (h0, torch.zeros_like(h0)))\n",
        "    else:\n",
        "      out, _ = self.net(x, h0)\n",
        "    # out: batch_size, seq_len, hidden_dim - every time step is kept\n",
        "    # apply nonlinearity like in original implementation\n",
        "    out = self.fc(out)\n",
        "    out = self.nonlinearity(out)\n",
        "    return out"
      ],
      "execution_count": 175,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "tdIy0HR3_Tku",
        "outputId": "08fab5d9-928c-4db3-ce1f-8401fc4ffb38"
      },
      "source": [
        "# testing the Custom_RNN module with an LSTM\n",
        "net = Custom_RNN(6, 24, 5, rnn_type=\"lstm\")\n",
        "# converting everything to float to avoid data type runtime errors\n",
        "net.float()\n",
        "\n",
        "# using one minibatch as example data; the array is built as float32 right\n",
        "# away (ndarray.astype returns a new array, so calling it without keeping\n",
        "# the result has no effect)\n",
        "dta = torch.from_numpy(np.array(X_mb, dtype=np.float32))\n",
        "\n",
        "net(dta).shape\n",
        "# yields 128 x 24 x 24 => 128 samples in batch, 24 time steps, every time step\n",
        "# is embedded into 24 features\n",
        "# => we embed in a higher dimensional space in this case, as mentioned in the paper\n",
        "# everything seems to work until here"
      ],
      "execution_count": 176,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "torch.Size([128, 24, 24])"
            ]
          },
          "metadata": {},
          "execution_count": 176
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dAJqK43882JY"
      },
      "source": [
        "def embedder(X, hidden_dim, num_layers, rnn_type): # sequence length is only important for dynamic rnns \n",
        "  \"\"\"Embedding network between original feature space to latent space.\n",
        "\n",
        "    Note: a new Custom_RNN with freshly initialised weights is built on every\n",
        "    call and is not returned, so two calls give different embeddings.\n",
        "\n",
        "    Args:\n",
        "      - X: input time-series features (batch of input data), convertible to\n",
        "        an array of shape [batch, seq_len, n_features]\n",
        "      - hidden_dim: Hidden dimension of RNN\n",
        "      - num_layers: Number of RNN layers\n",
        "      - rnn_type: type of rnn [rnn, gru, lstm]\n",
        "\n",
        "    Returns:\n",
        "      - H: embeddings, tensor of shape [batch, seq_len, hidden_dim]\n",
        "  \"\"\"\n",
        "  # Build the float32 tensor in one step; the array is converted only once\n",
        "  # and the name of the builtin input() stays untouched\n",
        "  features = torch.from_numpy(np.asarray(X, dtype=np.float32))\n",
        "  input_size = features.shape[2] # number of features [128, 24, 6]\n",
        "\n",
        "  net = Custom_RNN(input_size, hidden_dim, num_layers, rnn_type)\n",
        "  net.float()\n",
        "  H = net(features)\n",
        "  return H"
      ],
      "execution_count": 177,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "CliayKUr9MKc",
        "outputId": "1f5e580a-9cc1-4a74-c869-628c93ff6e02"
      },
      "source": [
        "# embed the mini-batch with a 2-layer LSTM of hidden size 20\n",
        "H = embedder(X_mb, 20, 2, \"lstm\")\n",
        "H.shape"
      ],
      "execution_count": 178,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "torch.Size([128, 24, 20])"
            ]
          },
          "metadata": {},
          "execution_count": 178
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "aNZKYO5AAsTD"
      },
      "source": [
        "def recovery(H, hidden_dim, num_layers, rnn_type, orig_dim):   \n",
        "  \"\"\"Recovery network: maps latent representations back to the data space.\n",
        "\n",
        "  Args:\n",
        "    - H: latent representation; its third dimension is used as the input size\n",
        "    - hidden_dim: Hidden dimension of RNN\n",
        "    - num_layers: Number of RNN layers\n",
        "    - rnn_type: type of rnn [rnn, gru, lstm]\n",
        "    - orig_dim: original data dimensionality to recover\n",
        "\n",
        "  Returns:\n",
        "    - X_tilde: recovered data (output of a newly created Custom_RNN with embed=False)\n",
        "  \"\"\"\n",
        "  latent_dim = H.shape[2]\n",
        "  # embed=False makes the final Linear layer project to orig_dim features;\n",
        "  # .float() returns the module itself, so the calls can be chained\n",
        "  decoder = Custom_RNN(latent_dim, hidden_dim, num_layers, rnn_type, orig_dim, embed=False).float()\n",
        "  return decoder(H)"
      ],
      "execution_count": 184,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "NlrU-2NNiuxt",
        "outputId": "bfef4fab-2e65-49f5-8e25-acc3581aacfa"
      },
      "source": [
        "# map the embeddings back to 6 features per time step\n",
        "X_tilde = recovery(H, 20, 2, \"lstm\", orig_dim=6)\n",
        "X_tilde.shape"
      ],
      "execution_count": 185,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "torch.Size([128, 24, 6])"
            ]
          },
          "metadata": {},
          "execution_count": 185
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2yi1SAMWkBVx",
        "outputId": "f83a2a6c-c9c4-47d2-989d-46f1cbf3a5d8"
      },
      "source": [
        ""
      ],
      "execution_count": 183,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "torch.Size([128, 24, 20])"
            ]
          },
          "metadata": {},
          "execution_count": 183
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NpCC3l64kCEC"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}
